Important Library¶

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

Loading dataset¶

In [5]:
df=pd.read_csv(r"C:\Users\DELL\Downloads\diabetes.csv")

exploratory Data Analysis¶

In [13]:
# Display first 10 record of the data
df.head()
Out[13]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [14]:
# Display last 10 record of the data
df.tail()
Out[14]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
763 10 101 76 48 180 32.9 0.171 63 0
764 2 122 70 27 0 36.8 0.340 27 0
765 5 121 72 23 112 26.2 0.245 30 0
766 1 126 60 0 0 30.1 0.349 47 1
767 1 93 70 31 0 30.4 0.315 23 0
In [17]:
# Display randomly any number of record of the data
df.sample(5)
Out[17]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
394 4 158 78 0 0 32.9 0.803 31 1
633 1 128 82 17 183 27.5 0.115 22 0
248 9 124 70 33 402 35.4 0.282 34 0
180 6 87 80 0 0 23.2 0.084 32 0
99 1 122 90 51 220 49.7 0.325 31 1
In [19]:
# number of rows and column
df.shape
Out[19]:
(768, 9)
In [21]:
# list types of all columns
df.dtypes
Out[21]:
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object
In [22]:
# findout if the dataset is contain null value or not
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [23]:
# Statistical summary
df.describe()
Out[23]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
In [24]:
df.shape
Out[24]:
(768, 9)
In [27]:
df=df.drop_duplicates()
In [28]:
df.shape
Out[28]:
(768, 9)
In [29]:
# Count the null values
#checking the missing value in any column
#Dispplay number of null values in every column in dataset

df.isnull().sum()
Out[29]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
In [30]:
df.columns
Out[30]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

checking the no.of zeros values in dataset¶

In [34]:
print('NO.of zeros value  in Glucose',df[df['Glucose']==0].shape[0])
NO.of zeros value  in Glucose 5
In [35]:
print('NO.of zeros value  in BloodPressure',df[df['BloodPressure']==0].shape[0])
NO.of zeros value  in BloodPressure 35
In [36]:
print('NO.of zeros value  in SkinThickness',df[df['SkinThickness']==0].shape[0])
NO.of zeros value  in SkinThickness 227
In [37]:
print('NO.of zeros value  in Insulin',df[df['Insulin']==0].shape[0])
NO.of zeros value  in Insulin 374
In [38]:
print('NO.of zeros value  is BMI',df[df['BMI']==0].shape[0])
NO.of zeros value  is BMI 11

Replace no.of zeroes valurs with mean of that columns¶

In [39]:
df['Glucose']=df['Glucose'].replace(0,df['Glucose'].mean())
print('NO.of zeros value  in Glucose',df[df['Glucose']==0].shape[0])
NO.of zeros value  in Glucose 0
In [42]:
df['BloodPressure']=df['BloodPressure'].replace(0,df['BloodPressure'].mean())
df['SkinThickness']=df['SkinThickness'].replace(0,df['SkinThickness'].mean())
df['Insulin']=df['Insulin'].replace(0,df['Insulin'].mean())
df['BMI']=df['BMI'].replace(0,df['BMI'].mean())
In [43]:
df.describe()
Out[43]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 121.681605 72.254807 26.606479 118.660163 32.450805 0.471876 33.240885 0.348958
std 3.369578 30.436016 12.115932 9.631241 93.080358 6.875374 0.331329 11.760232 0.476951
min 0.000000 44.000000 24.000000 7.000000 14.000000 18.200000 0.078000 21.000000 0.000000
25% 1.000000 99.750000 64.000000 20.536458 79.799479 27.500000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 79.799479 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000

Data Visualization¶

In [80]:
f, ax = plt.subplots(1, 2, figsize=(10, 5))
df['Outcome'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Outcome')
ax[0].set_ylabel('')
sns.countplot(x='Outcome', data=df, ax=ax[1])
#sns.countplot(y='Outcome', data=df, ax=ax[1])
ax[1].set_title('Outcome')
N, P = df['Outcome'].value_counts()
print('Negative(0): ', N)
print('Positive(1): ', P)
plt.grid()
plt.show()
Negative(0):  500
Positive(1):  268
No description has been provided for this image
In [81]:
# Histogramof each feature
df.hist(bins=10,figsize=(10,10))
plt.show()
No description has been provided for this image
In [84]:
# scatter plot matix
from pandas.plotting import scatter_matrix
scatter_matrix(df,figsize=(20,20));
No description has been provided for this image
In [87]:
# pairplot
sns.pairplot(data=df,hue='Outcome')
plt.show()
No description has been provided for this image

Analysing the relationship between the variables¶

Correlation Analysis¶

In [95]:
# get correlation of each features in dataset
corrmat = df.corr()
top_corr_features = corrmat.index

plt.figure(figsize=(10, 10))
# Plot the heatmap
g = sns.heatmap(corrmat, annot=True, cmap='RdYlGn')
plt.show()
No description has been provided for this image

Split the dataframe into x and y¶

In [98]:
target_name='Outcome'
#sperate object for target feature
y=df[target_name]
# Seperate Object for input featyres
X=df.drop(target_name,axis=1)
In [99]:
X.head()
Out[99]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age
0 6 148.0 72.0 35.000000 79.799479 33.6 0.627 50
1 1 85.0 66.0 29.000000 79.799479 26.6 0.351 31
2 8 183.0 64.0 20.536458 79.799479 23.3 0.672 32
3 1 89.0 66.0 23.000000 94.000000 28.1 0.167 21
4 0 137.0 40.0 35.000000 168.000000 43.1 2.288 33
In [100]:
y.head()
Out[100]:
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

Apply Feature Scaling¶

In [102]:
# Apply Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X)
SSx=scaler.transform(X)

Train Test split¶

In [103]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(SSx,y,test_size=0.2, random_state=7)
In [104]:
X_train.shape,y_train.shape
Out[104]:
((614, 8), (614,))
In [105]:
X_test.shape,y_test.shape
Out[105]:
((154, 8), (154,))

Build the classification Algorithm¶

  1. Logistic regression
In [108]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(solver='liblinear',multi_class='ovr')
lr.fit(X_train,y_train)
Out[108]:
LogisticRegression(multi_class='ovr', solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(multi_class='ovr', solver='liblinear')
  1. KneighboursClssifier(KNN)
In [109]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()
knn.fit(X_train,y_train)
Out[109]:
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
  1. Nave-Bayes classfier
In [112]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
Out[112]:
GaussianNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GaussianNB()
  1. Support Vector Machine (SVM)
In [117]:
from sklearn.svm import SVC
sv=SVC()
sv.fit(X_train,y_train)
Out[117]:
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
  1. Decision Tree
In [118]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
Out[118]:
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
  1. Random Forest
In [119]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(criterion='entropy')
rf.fit(X_train,y_train)
Out[119]:
RandomForestClassifier(criterion='entropy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(criterion='entropy')

Making Prediction¶

  1. making prediction on test by using logistic Regresiion
In [127]:
X_test.shape
Out[127]:
(154, 8)
In [121]:
lr_pred=lr.predict(X_test)
In [128]:
lr_pred.shape
Out[128]:
(154,)
  1. making prediction on test by using KNN
In [123]:
knn_pred=knn.predict(X_test)
  1. making prediction on test by using Naivie Bayes
In [125]:
nb_pred=nb.predict(X_test)
  1. making prediction on test by using SVM
In [130]:
sv_pred=sv.predict(X_test)
  1. making prediction on test by using Decision Tree
In [132]:
dt_pred=dt.predict(X_test)
  1. making prediction on test by using Random Forest
In [134]:
rf_pred=rf.predict(X_test)

Model Evaluation¶

  1. Train Score and Test Score
In [137]:
from sklearn.metrics import accuracy_score
print("train accuracy of Logistic Regression",lr.score(X_train,y_train)*100)
print("Accuracy (Test) score of LogisticRegression",lr.score(X_test,y_test)*100)
print("Accuracy(Test) score of Logistic Regression",accuracy_score(y_test,lr_pred)*100)
train accuracy of Logistic Regression 77.36156351791531
Accuracy (Test) score of LogisticRegression 77.27272727272727
Accuracy(Test) score of Logistic Regression 77.27272727272727
In [138]:
# knn
print("train accuracy of KNN",knn.score(X_train,y_train)*100)
print("Accuracy (Test) score of KNN",knn.score(X_test,y_test)*100)
print("Accuracy(Test) score of KNN",accuracy_score(y_test,knn_pred)*100)
train accuracy of KNN 81.10749185667753
Accuracy (Test) score of KNN 74.67532467532467
Accuracy(Test) score of KNN 74.67532467532467
In [141]:
#Navie-Bayes
print("train accuracy of  Navie Bayes",nb.score(X_train,y_train)*100)
print("Accuracy (Test) score of Navie Bayes",nb.score(X_test,y_test)*100)
print("Accuracy(Test) score of  Navie Bayes",accuracy_score(y_test,nb_pred)*100)
train accuracy of  Navie Bayes 74.2671009771987
Accuracy (Test) score of Navie Bayes 74.02597402597402
Accuracy(Test) score of  Navie Bayes 74.02597402597402
In [142]:
# SVM
print("train accuracy of SVM",sv.score(X_train,y_train)*100)
print("Accuracy (Test) score ofSVM",sv.score(X_test,y_test)*100)
print("Accuracy(Test) score of SVM",accuracy_score(y_test,sv_pred)*100)
train accuracy of SVM 81.92182410423453
Accuracy (Test) score ofSVM 83.11688311688312
Accuracy(Test) score of SVM 83.11688311688312
In [143]:
# Decision Tree
print("train accuracy of Decision Tree",dt.score(X_train,y_train)*100)
print("Accuracy (Test) score ofDecision Tree",dt.score(X_test,y_test)*100)
print("Accuracy(Test) score of Decision Tree",accuracy_score(y_test,dt_pred)*100)
train accuracy of Decision Tree 100.0
Accuracy (Test) score ofDecision Tree 80.51948051948052
Accuracy(Test) score of Decision Tree 80.51948051948052
In [146]:
# Random Forest
print("train accuracy of  Random Forest",rf.score(X_train,y_train)*100)
print("Accuracy (Test) score of Random Forest",rf.score(X_test,y_test)*100)
print("Accuracy(Test) score of  Random Forest",accuracy_score(y_test,rf_pred)*100)
train accuracy of  Random Forest 100.0
Accuracy (Test) score of Random Forest 79.22077922077922
Accuracy(Test) score of  Random Forest 79.22077922077922

Confusion Matrix¶

In [148]:
from sklearn.metrics import classification_report,confusion_matrix
# confusion Matrix of Logistic Regression
cm=confusion_matrix(y_test,lr_pred)
cm
Out[148]:
array([[86, 11],
       [24, 33]], dtype=int64)
In [151]:
sns.heatmap(confusion_matrix(y_test,lr_pred),annot=True,fmt="d")
Out[151]:
<Axes: >
No description has been provided for this image
In [155]:
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
In [156]:
TN,FP,FN,TP
Out[156]:
(86, 11, 24, 33)
In [213]:
# making the confusion matrix of the Logistic Regression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
cm=confusion_matrix(y_test,lr_pred)

print('TN-True Negative {} '.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
      
TN-True Negative 86 
FP-False Positive 11 
FN-false Negative 24 
TP-True Positive 33 
Accuracy rate 77.27272727272727 
misclassification Rate:22.727272727272727 
In [214]:
77.27272727272727+22.727272727272727 
Out[214]:
100.0
In [215]:
import matplotlib.pyplot as plt
import numpy as np

plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['0', '1']
plt.title('Confusion Matrix of the logistic Regression')
plt.ylabel('Actual (true) values')
plt.xlabel('Predicted values')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
s = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        plt.text(j, i, str(s[i][j]) + "-" + str(cm[i][j]))
plt.show()
No description has been provided for this image
In [216]:
pd.crosstab(y_test,lr_pred,margins=False)
Out[216]:
col_0 0 1
Outcome
0 86 11
1 24 33
In [217]:
pd.crosstab(y_test,lr_pred,margins=True)
Out[217]:
col_0 0 1 All
Outcome
0 86 11 97
1 24 33 57
All 110 44 154
In [218]:
pd.crosstab(y_test,lr_pred,rownames=['Actual values'],colnames=['Predicted values'],margins=True)
Out[218]:
Predicted values 0 1 All
Actual values
0 86 11 97
1 24 33 57
All 110 44 154

Precision(PPV-Positive Predictive Value)¶

In [219]:
TP,FP
Out[219]:
(33, 11)
In [220]:
Precision=TP/(TP+FP)
Precision
Out[220]:
0.75
In [221]:
33/(33+11)
Out[221]:
0.75
In [222]:
# print precision score
precision_Score=TP/float(TP+FP)*100
print('Precision score:{0:0.4f}'.format(precision_Score))
Precision score:75.0000
In [223]:
from sklearn.metrics import precision_score

# Calculating precision score
precision = precision_score(y_test, lr_pred) * 100
print("Precision Score is:", precision)

# Calculating micro average precision score
micro_precision = precision_score(y_test, lr_pred, average='micro') * 100
print("Micro Average Precision Score is:", micro_precision)


# Calculating macro-average precision score
macro_precision = precision_score(y_test, lr_pred, average='macro') * 100
print("Macro Average Precision Score is:", macro_precision)



# Calculating weighted precision score
weighted_precision = precision_score(y_test, lr_pred, average='weighted') * 100
print("Weighted  Average Precision Score is:", weighted_precision)

# Calculate precision score on non weighted score
print("precision score on non weighted score is:",precision_score(y_test,lr_pred,average=None)*100)
Precision Score is: 75.0
Micro Average Precision Score is: 77.27272727272727
Macro Average Precision Score is: 76.5909090909091
Weighted  Average Precision Score is: 77.00413223140497
precision score on non weighted score is: [78.18181818 75.        ]
In [224]:
print('Classification Report of Logistic Regression:\n',classification_report(y_test,lr_pred,digits=4))
Classification Report of Logistic Regression:
               precision    recall  f1-score   support

           0     0.7818    0.8866    0.8309        97
           1     0.7500    0.5789    0.6535        57

    accuracy                         0.7727       154
   macro avg     0.7659    0.7328    0.7422       154
weighted avg     0.7700    0.7727    0.7652       154

Recall(True Positive rate(TRP))¶

In [225]:
recall_score=TP/float(TP+FN)*100
print('recall_score',recall_score)
recall_score 57.89473684210527
In [226]:
TP,FN
Out[226]:
(33, 24)
In [227]:
33/(33+24)
Out[227]:
0.5789473684210527
In [228]:
from sklearn.metrics import recall_score

# Calculate recall score
recall = recall_score(y_test, lr_pred) * 100

# Print recall score
print('Recall or Sensitivity Score:', recall)
Recall or Sensitivity Score: 57.89473684210527
In [229]:
micro_precision = recall_score(y_test, lr_pred, average='micro') * 100
print("Micro Average recall Score is:", micro_precision)


# Calculating macro-average precision score
macro_precision = recall_score(y_test, lr_pred, average='macro') * 100
print("Macro Average recall Score is:", macro_precision)



# Calculating weighted precision score
weighted_precision = recall_score(y_test, lr_pred, average='weighted') * 100
print("Weighted  Average recall Score is:", weighted_precision)

# Calculate precision score on non weighted score
print("recall score on non weighted score is:",recall_score(y_test,lr_pred,average=None)*100)
Micro Average recall Score is: 77.27272727272727
Macro Average recall Score is: 73.27726532826912
Weighted  Average recall Score is: 77.27272727272727
recall score on non weighted score is: [88.65979381 57.89473684]
In [230]:
print('Classification Report of Logistic Regression:\n',classification_report(y_test,lr_pred,digits=4))
Classification Report of Logistic Regression:
               precision    recall  f1-score   support

           0     0.7818    0.8866    0.8309        97
           1     0.7500    0.5789    0.6535        57

    accuracy                         0.7727       154
   macro avg     0.7659    0.7328    0.7422       154
weighted avg     0.7700    0.7727    0.7652       154

In [231]:
FPR = FP / (FP + TN)*100

# Print False Positive Rate
print("False Positive Rate (FPR) is:{0:0.4f}".format(FPR))
False Positive Rate (FPR) is:11.3402
In [232]:
FP,TN
Out[232]:
(11, 86)
In [233]:
11/(11+86)
Out[233]:
0.1134020618556701

Specificity¶

In [234]:
specificity = tn / (tn + fp)*100
print("Specificity (True Negative Rate) is:{0:0.4f}".format(specificity))
Specificity (True Negative Rate) is:88.6598

F1Score¶

In [235]:
from sklearn.metrics import f1_score

# Calculate F1 score
f1 = f1_score(y_test, lr_pred)*100

# Print F1 score
print("F1 Score is:", f1)
F1 Score is: 65.34653465346535
In [236]:
micro_precision = f1_score(y_test, lr_pred, average='micro') * 100
print("Micro Average f1 Score is:", micro_precision)


# Calculating macro-average precision score
macro_precision = recall_score(y_test, lr_pred, average='macro') * 100
print("Macro Average f1 Score is:", macro_precision)



# Calculating weighted precision score
weighted_precision = f1_score(y_test, lr_pred, average='weighted') * 100
print("Weighted  Average f1 Score is:", weighted_precision)

# Calculate precision score on non weighted score
print("f1 score on non weighted score is:",f1_score(y_test,lr_pred,average=None)*100)
Micro Average f1 Score is: 77.27272727272727
Macro Average f1 Score is: 73.27726532826912
Weighted  Average f1 Score is: 76.52373933045479
f1 score on non weighted score is: [83.09178744 65.34653465]

Classification report on logistic regression¶

In [237]:
print('Classification Report of Logistic Regression:\n',classification_report(y_test,lr_pred,digits=4))
Classification Report of Logistic Regression:
               precision    recall  f1-score   support

           0     0.7818    0.8866    0.8309        97
           1     0.7500    0.5789    0.6535        57

    accuracy                         0.7727       154
   macro avg     0.7659    0.7328    0.7422       154
weighted avg     0.7700    0.7727    0.7652       154

ROC(Reciever operating characteristic) curve and ROC AUC(Area Under Curve)¶

In [238]:
from sklearn.metrics import roc_auc_score

# Calculate AUC score
auc = roc_auc_score(y_test, lr_pred)

# Print AUC score
print("Area Under the Curve (AUC) is:", auc)
Area Under the Curve (AUC) is: 0.7327726532826913
In [301]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Assuming you have already computed fpr, tpr, and roc_auc

# Plot ROC curve
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve logistic regression')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image

confusion matrix of KNN¶

In [240]:
sns.heatmap(confusion_matrix(y_test,knn_pred),annot=True,fmt="d")
Out[240]:
<Axes: >
No description has been provided for this image
In [255]:
# making the confusion matrix of the KNN
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
cm=confusion_matrix(y_test,knn_pred)

print('TN-True Negative {}'.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate of KNN:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
      
TN-True Negative 82
FP-False Positive 15 
FN-false Negative 24 
TP-True Positive 33 
Accuracy rate 74.67532467532467 
misclassification Rate of KNN:25.324675324675322 
In [256]:
74.67532467532467+25.324675324675322 
Out[256]:
100.0
In [257]:
#classification report of KNN
print('Classification Report of KNN:\n',classification_report(y_test,knn_pred,digits=4))
Classification Report of KNN:
               precision    recall  f1-score   support

           0     0.7736    0.8454    0.8079        97
           1     0.6875    0.5789    0.6286        57

    accuracy                         0.7468       154
   macro avg     0.7305    0.7122    0.7182       154
weighted avg     0.7417    0.7468    0.7415       154

Area Under Curve Of KNN¶

In [258]:
# Area Under Curve
auc = roc_auc_score(y_test, knn_pred)

# Print AUC score
print("Area Under the Curve (AUC) is:", auc)
Area Under the Curve (AUC) is: 0.7121540965816603
In [259]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Assuming you have already computed fpr, tpr, and thresholds

# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve of KNN')
plt.legend()
plt.grid()
plt.show()
No description has been provided for this image

Confusion matrix of "Naive Bayes"¶

In [260]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming you have already trained your Naive Bayes classifier (nb) and made predictions (nb_pred)
# y_test contains the true labels

# Generate confusion matrix
cm = confusion_matrix(y_test, nb_pred)
print('TN-True Negative {}'.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate of Naive Bayes:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
      
TN-True Negative 78
FP-False Positive 19 
FN-false Negative 21 
TP-True Positive 36 
Accuracy rate 74.02597402597402 
misclassification Rate of Naive Bayes:25.97402597402597 
In [261]:
74.02597402597402 +25.97402597402597 
Out[261]:
100.0
In [262]:
sns.heatmap(confusion_matrix(y_test,nb_pred),annot=True,fmt="d")
Out[262]:
<Axes: >
No description has been provided for this image

Classification Report of Naive Bayes¶

In [263]:
print('Classification Report of KNN:\n',classification_report(y_test,nb_pred,digits=4))
Classification Report of KNN:
               precision    recall  f1-score   support

           0     0.7879    0.8041    0.7959        97
           1     0.6545    0.6316    0.6429        57

    accuracy                         0.7403       154
   macro avg     0.7212    0.7179    0.7194       154
weighted avg     0.7385    0.7403    0.7393       154

Roc AUC Score OF Naive Bayes¶

In [264]:
auc = roc_auc_score(y_test, nb_pred)

# Print AUC score
print("Area Under the Curve (AUC) is:", auc)
Area Under the Curve (AUC) is: 0.7178513293543136
In [300]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Assuming you have already computed fpr, tpr, and thresholds

# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve of Naive bayes')
plt.legend()
plt.grid()
plt.show()
No description has been provided for this image

confusion matrix of SVM¶

In [270]:
sns.heatmap(confusion_matrix(y_test,sv_pred),annot=True,fmt="d")
Out[270]:
<Axes: >
No description has been provided for this image
In [290]:
#making the confusion matrix of svm
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming you have already trained your Naive Bayes classifier (nb) and made predictions (nb_pred)
# y_test contains the true labels

# Generate confusion matrix
cm = confusion_matrix(y_test, sv_pred)
print('TN-True Negative {}'.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate of SVM:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
      
TN-True Negative 91
FP-False Positive 6 
FN-false Negative 20 
TP-True Positive 37 
Accuracy rate 83.11688311688312 
misclassification Rate of SVM:16.883116883116884 
In [277]:
# classification report of svm
print('Classification Report of svm:\n',classification_report(y_test,sv_pred,digits=4))
Classification Report of svm:
               precision    recall  f1-score   support

           0     0.8198    0.9381    0.8750        97
           1     0.8605    0.6491    0.7400        57

    accuracy                         0.8312       154
   macro avg     0.8401    0.7936    0.8075       154
weighted avg     0.8349    0.8312    0.8250       154

Roc AUC of Svm¶

In [279]:
from sklearn.metrics import roc_auc_score
auc = round(roc_auc_score(y_test, sv_pred)*100,2)
print("roc_auc_score of svc:", auc)
roc_auc_score of svc: 79.36
In [282]:
fpr, tpr, thresholds = roc_curve(y_test, sv_pred)
plt.plot(fpr, tpr, color='darkorange', label='ROC')
plt.plot([0, 1], [0, 1], color='navy',  linestyle='--',label='ROC curve(area=%0.2f)'%auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for SVM Classifier')
plt.legend()
plt.grid()
plt.show()
No description has been provided for this image

confusion matrix of Decision tree¶

In [283]:
sns.heatmap(confusion_matrix(y_test,dt_pred),annot=True,fmt="d")
Out[283]:
<Axes: >
No description has been provided for this image
In [285]:
#making the confusion matrix of Decision tree
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Generate confusion matrix
cm = confusion_matrix(y_test, dt_pred)
print('TN-True Negative {}'.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate of decision tree:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
      
TN-True Negative 82
FP-False Positive 15 
FN-false Negative 15 
TP-True Positive 42 
Accuracy rate 80.51948051948052 
misclassification Rate of decision tree:19.480519480519483 
In [286]:
# classification report of Decision tree
print('Classification Report of Decision tree:\n',classification_report(y_test,dt_pred,digits=4))
Classification Report of Decision tree:
               precision    recall  f1-score   support

           0     0.8454    0.8454    0.8454        97
           1     0.7368    0.7368    0.7368        57

    accuracy                         0.8052       154
   macro avg     0.7911    0.7911    0.7911       154
weighted avg     0.8052    0.8052    0.8052       154

Roc AUC of Decision tree¶

In [296]:
from sklearn.metrics import roc_auc_score
auc = round(roc_auc_score(y_test, dt_pred)*100,2)
print("roc_auc_score of decision tree:", auc)
roc_auc_score of decision tree: 79.11
In [297]:
fpr, tpr, thresholds = roc_curve(y_test, dt_pred)
plt.plot(fpr, tpr, color='darkorange', label='ROC')
plt.plot([0, 1], [0, 1], color='navy',  linestyle='--',label='ROC curve(area=%0.2f)'%auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Decision tree')
plt.legend()
plt.grid()
plt.show()
No description has been provided for this image

confusion matrix of random forest¶

In [291]:
sns.heatmap(confusion_matrix(y_test,rf_pred),annot=True,fmt="d")
Out[291]:
<Axes: >
No description has been provided for this image
In [292]:
#making the confusion matrix of Decision tree
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Generate confusion matrix
cm = confusion_matrix(y_test, rf_pred)
print('TN-True Negative {}'.format(cm[0,0]))
print('FP-False Positive {} '. format(cm[0,1]))
print('FN-false Negative {} '. format(cm[1,0]))
print('TP-True Positive {} '.format(cm[1,1]))
print('Accuracy rate {} '.format(np.divide(np.sum([cm[0,0],cm[1,1]]),np.sum(cm))*100))
print('misclassification Rate of decision tree:{} '.format(np.divide(np.sum([cm[0,1],cm[1,0]]),np.sum(cm))*100))
      
TN-True Negative 84
FP-False Positive 13 
FN-false Negative 19 
TP-True Positive 38 
Accuracy rate 79.22077922077922 
misclassification Rate of decision tree:20.77922077922078 
In [293]:
# classification report of random forest
print('Classification Report of Decision tree:\n',classification_report(y_test,rf_pred,digits=4))
Classification Report of Decision tree:
               precision    recall  f1-score   support

           0     0.8155    0.8660    0.8400        97
           1     0.7451    0.6667    0.7037        57

    accuracy                         0.7922       154
   macro avg     0.7803    0.7663    0.7719       154
weighted avg     0.7895    0.7922    0.7896       154

roc auc of random forest¶

In [294]:
from sklearn.metrics import roc_auc_score
auc = round(roc_auc_score(y_test, rf_pred)*100,2)
print("roc_auc_score of random forest:", auc)
roc_auc_score of decision tree: 76.63
In [299]:
fpr, tpr, thresholds = roc_curve(y_test, rf_pred)
plt.plot(fpr, tpr, color='darkorange', label='ROC')
plt.plot([0, 1], [0, 1], color='navy',  linestyle='--',label='ROC curve(area=%0.2f)'%auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for random forest')
plt.legend()
plt.grid()
plt.show()
No description has been provided for this image

END¶

In [ ]: